Setup

Load packages

library(data.table)
library(stringr)
library(ggplot2)
theme_py <- theme_light() + theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.border = element_rect(colour = "black", fill = NA),
  text = element_text(size = 20),
  strip.placement = "outside",
  strip.text = element_text(size = 20, color = "black"),
  strip.background = element_rect(fill = "white")
)
theme_set(theme_py)
library(patchwork)
library(ggrepel)
library(ComplexHeatmap)
library(GenomicRanges)
library(openxlsx)
require(ggraph)
require(igraph)
source("scripts/functions.R")
source("../motif-analysis/mta_downstream_functions.R")

Data directories and metadata

atac_dir <- file.path(
  "ATACSEQ", "nucleosome_free_regions", "results"
)
cns_dir <- file.path(
  "ATACSEQ", "nucleosome_free_regions", "consensus_peaks"
)
bnd_dir <- file.path(
  "ATACSEQ", "nucleosome_free_regions", "footprint", "BINDetect"
)
rna_dir <- file.path(
  "RNASEQ_QUANTIFICATION", "results"
)
pub_dir <- file.path("published")
res_dir <- file.path("results")
fig_dir <- file.path("plots")
for (newdir in c(res_dir, fig_dir))
  dir.create(newdir, showWarnings = FALSE)

# metadata
des_fn <- file.path("RNASEQ_QUANTIFICATION", "design_table.tsv")
des_dt <- fread(des_fn)
setnames(des_dt, "reporter_line", "reporterline")
col_dt <- des_dt[, .(sample, reporterline, condition)]
col_dt[, reporterline := factor(
  reporterline,
  levels = c("Elav", "Fox", "Ncol")
)]
col_dt[, condition := factor(condition, levels = c("negative", "positive"))]
setorder(col_dt, reporterline, condition)

# colors
condition_cols <- c("positive" = "blue", "negative" = "red")
line_cols <- c("Elav" = "#ff7f00", "Fox" = "#984ea3", "Ncol" = "#4daf4a")
line_cond_cols <- c(
  "Elav_pos" = "#ff7f00", "Fox_pos" = "#984ea3", "Ncol_pos" = "#4daf4a",
  "Elav_neg" = "#ebbd8f", "Fox_neg" = "#d18adb", "Ncol_neg" = "#90d18e"
)

Gene annotations and lists

marks_gold <- fread(
  file.path("annotation", "golden-marks-231124.tsv"),
  fill = TRUE, sep = "\t", header = FALSE
)[, c(2:1)]
setnames(marks_gold, c("gene", "name"))
marks_gold_v <- structure(marks_gold$name, names = marks_gold$gene)

tfs_annt <- fread(
  file.path("annotation", "tfs.Nvec.tsv"),
  header = TRUE
)[, .SD[1], gene]
setnames(tfs_annt, c("gene", "name", "pfam"))

gene_annt <- fread(
  file.path("annotation", "Nvec_annotation_v3_2020-10-23_DToL_names"),
  select = 1:3
)
setnames(gene_annt, c("gene", "pfam", "name"))

gene_ann <- rbindlist(list(
  gene_annt[!gene %in% tfs_annt$gene],
  tfs_annt
), use.names = TRUE)
gene_ann[gene %in% marks_gold$gene, name := marks_gold_v[gene]]

Load normalized gene expression (RNA-seq) and motif accessibility (ATAC-seq) data

# GENES

# size norm gene expression
exps_mt <- read.table(
  file.path(rna_dir, "mat_norm.tsv")
)

# size and gene norm gene expression
expr_mt <- read.table(
  file.path(rna_dir, "norm_expression.tsv")
)

# gene expression TPMs
tpms_mt <- read.table(
  file.path(rna_dir, "tpm_expression.tsv")
)

# gene clusters
gene_clust <- fread(
  file.path(rna_dir, "genes_clusters.tsv")
)

# PEAKS

# peak accessibility
accs_mt <- read.table(
  file.path(atac_dir, "mat_norm.tsv")
)

# peak to gene assignment
assign <- fread(
  file.path(atac_dir, "consensusSeekeR-peaks-gene-assignment.tsv")
)

# peak clusters
pks_clust <- fread(
  file.path(atac_dir, "consensusSeekeR-peaks-clusters.bed")
)
bed_cols <- c(
  "seqnames", "start", "end", "peak", "score", "strand", "cluster", "group"
)
setnames(pks_clust, bed_cols)

# MOTIFS

# motif enrichment scores
motf_dt <- fread(
  file.path(atac_dir, "motif-enrich-archetypes-mona.tsv")
)
setnames(motf_dt, "label", "group")

# motif enrichment qvalue
motf_qv <- read.table(
  file.path(atac_dir, "motif-enrich-archetypes-mona-qvalue.tsv"),
  sep = "\t"
)

# motif enrichment log2fc
motf_fc <- read.table(
  file.path(atac_dir, "motif-enrich-archetypes-mona-fc.tsv"),
  sep = "\t"
)

# motif to tf assignment
dc <- fread(file.path(
  "annotation",
  "assignment-archetype-motif-gene.tsv.gz"
))[, .(gene, archetype_name)]
dc[, motif := str_extract(archetype_name, "ARCH\\d+")]
dc[, archetype_name := NULL]

Correlation between motif binding and gene expression

Load fold changes for expression and motif binding

# binding comparisons fold changes for best correlated motifs
dp_dt <- fread(file.path(
  bnd_dir, "bindetect_results_reporterline.txt"
))
dp_dt[, id := paste(reporterline, vs, sep = "_vs_")]
dp_dt[, motif_gene := paste(motif, gene, sep = "__")]
dt_atac <- dcast.data.table(
  dp_dt, motif_gene ~ id, value.var = "log2FoldChange"
)
mt_atac <- as.matrix(dt_atac[, -1])
rownames(mt_atac) <- dt_atac$motif_gene

# RNA comparisons fold changes
mt_rna <- read.table(
  file.path(rna_dir, "log2fc_expression.tsv"),
  sep = "\t",
  header = TRUE
)

Dot plot of expression and binding score

dt_rna_plot <- melt.data.table(
  as.data.table(mt_rna, keep.rownames = "gene"),
  id.vars = "gene", variable.name = "comparison", value.name = "rna_fc"
)
dt_atac_plot <- melt.data.table(
  as.data.table(mt_atac, keep.rownames = "motif_gene"),
  id.vars = "motif_gene", variable.name = "comparison", value.name = "atac_fc"
)
dt_atac_plot[, gene := str_remove(motif_gene, "ARCH\\d+__")]
dt_atac_plot[, motif := str_extract(motif_gene, "ARCH\\d+")]
dt_comb <- merge.data.table(
  dt_rna_plot, dt_atac_plot,
  by = c("gene", "comparison")
)

# filtering
ids_rna <- rownames(mt_rna)[apply(mt_rna, 1, function(x) max(x) > 1.5)]
ids_atac <- rownames(mt_atac)[apply(mt_atac, 1, function(x) max(x) > 0.1)]
dt_plot <- dt_comb[gene %in% ids_rna & motif_gene %in% ids_atac]

# add gene annotation
dt_plot <- merge.data.table(
  dt_plot, gene_ann, by = "gene",
  all.x = TRUE, sort = FALSE
)

# label for plotting
dt_plot[, label := paste(
  substr(name, 1, 30), gene, motif,
  sep = " | "
)]

# clustering
tfs <- unique(dt_plot$gene)
hc <- hclust(dist(mt_rna[tfs, ]), method = "ward.D2")
ord <- tfs[hc$order]

# ordering
tfs <- unique(dt_plot$gene)
mord <- order(apply(mt_rna[tfs,], 1, which.max))
ord <- tfs[mord]

# order genes
dt_plot[, gene := factor(gene, levels = rev(ord))]
setorder(dt_plot, gene)
dt_plot[, label := factor(label, levels = unique(dt_plot$label))]
setorder(dt_plot, label)

# limits for plotting
dt_plot[rna_fc < 0, rna_fc := 0]
dt_plot[rna_fc > 8, rna_fc := 8]

# plot
require(ggplot2)
require(scales)
gp_dot <- ggplot(dt_plot, aes(comparison, label)) +
  geom_point(
    aes(size = rna_fc, fill = atac_fc),
    shape = 21,
    color = "black"
  ) +
  scale_fill_gradientn(
    colours = colorRampPalette(RColorBrewer::brewer.pal(11, "BrBG"))(1000),
    # limits = c(-5, 5), oob = squish, breaks = c(-5, 0, 5),
    name = "motif binding\nlog2(fold change)",
  ) +
  scale_size_continuous(
    # range = c(1, 7),
    # breaks = c(0, 2, 4, 6),
    name = "gene expression\nlog2(fold change)"
  ) +
  theme(
    panel.grid.major = element_line(size = 0.25),
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust = 1),
    legend.direction = "vertical"
  )
gp_dot

Select correlated genes

mt_bnd <- copy(mt_atac)
rownames(mt_bnd) <- str_remove(rownames(mt_bnd),  "ARCH\\d+__")
mt_exp <- mt_rna[rownames(mt_bnd), colnames(mt_bnd)]
rownames(mt_bnd) <- rownames(mt_atac)
rownames(mt_exp) <- rownames(mt_atac)

# correlation between same comparisons (columns)
mt_cors_smp <- cor(mt_bnd, mt_exp, method = "spearman")
diag(mt_cors_smp)
##  Elav_vs_Fox Elav_vs_Ncol  Elav_vs_neg  Fox_vs_Elav  Fox_vs_Ncol   Fox_vs_neg 
##    0.1799928    0.1058748    0.3073588    0.1799928    0.2047478    0.2076299 
## Ncol_vs_Elav  Ncol_vs_Fox  Ncol_vs_neg 
##    0.1058748    0.2047478    0.1486817
# correlation between genes (rows)
mt_cors_gen <- cor(t(mt_bnd), t(mt_exp), method = "spearman")
cors_gen <- diag(mt_cors_gen)
names(cors_gen) <- rownames(mt_cors_gen)
cors_gen <- sort(cors_gen, decreasing = TRUE)

# select correlated genes
select_pos <- names(cors_gen[(cors_gen) > 0.5])

# correlation distribution
dt_cors <- data.table(cor = cors_gen)
dt_cors$motif_gene <- names(cors_gen)
dt_cors[, correlation := "no"]
dt_cors[motif_gene %in% select_pos, correlation := "positive"]
dt_cors[, motif := str_extract(motif_gene, "ARCH\\d+")]
dt_cors[, gene := str_remove(motif_gene, "ARCH\\d+__")]

# plot
gp_hist <- ggplot(dt_cors, aes(cor, fill = correlation)) +
  geom_histogram(color = "black", binwidth = 0.05) +
  scale_y_continuous(expand = expansion(c(0, 0.1))) +
  scale_fill_manual(values = c(
    "positive" = "green",
    "no" = "grey"
  )) +
  labs(x = "correlation", y = "number of genes") +
  theme(legend.position = "none")
gp_hist

Inspect correlations between gene expression and motif binding score per sample

mt_ord <- cbind(mt_exp, mt_bnd)
colnames(mt_ord) <- paste(
  colnames(mt_ord),
  rep(c("RNA", "ATAC"), each = 9),
  sep = "_"
)
mt_dt <- as.data.table(mt_ord, keep.rownames = "motif_gene")
mt_dt[, motif := str_extract(motif_gene, "ARCH\\d+")]
mt_dt[, gene := str_remove(motif_gene, "ARCH\\d+__")]
mt_dt[, motif_gene := NULL]
dt_atac <- melt.data.table(
  mt_dt,
  id.vars = c("gene", "motif"),
  measure.vars = grep("ATAC", colnames(mt_dt), value = TRUE),
  value.name = "ATAC", variable.name = "comparison"
)
dt_atac[, comparison := str_remove(comparison, "_ATAC")]

dt_rna <- melt.data.table(
  mt_dt,
  id.vars = "gene",
  measure.vars = grep("RNA", colnames(mt_dt), value = TRUE),
  value.name = "RNA", variable.name = "comparison"
)
dt_rna[, comparison := str_remove(comparison, "_RNA")]

dt_comp <- merge.data.table(
  dt_rna, dt_atac,
  by = c("gene", "comparison"), all = TRUE
)
setcolorder(dt_comp, c("motif", "gene", "ATAC", "RNA"))
dt_comp[, reporterline := str_extract(comparison, "Elav|Fox|Ncol")]
dt_comp[, vs := str_extract(comparison, "vs.*")]
dt_comp[, vs := str_remove(vs, "vs_")]
dt_comp <- merge.data.table(
  dt_comp, dt_cors, by = c("motif", "gene"),
  all.x = TRUE, sort = FALSE
)

# add gene annotation
dt_comp <- merge.data.table(
  dt_comp, gene_ann, by = "gene",
  all.x = TRUE, sort = FALSE
)

# save
fwrite(
  dt_comp,
  file.path(res_dir, "correlation_expression_binding.tsv"),
  sep = "\t"
)

# label for plotting
dt_comp[, label := substr(name, 1, 20)]

# posterboys to label
dt_lab <- dt_comp[cor > 0.5 & abs(RNA) > 1 & abs(ATAC) > 0.1]

# plot
gp_comp <- ggplot(dt_comp, aes(RNA, ATAC, color = correlation)) +
  geom_point(data = dt_comp[correlation == "no"], alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_text_repel(
    data = dt_lab[ATAC > 0],
    aes(label = label),
    color = "black",
    size = 3,
    segment.color = "grey50",
    min.segment.length = 0,
    nudge_y = 0.5 - dt_lab[ATAC > 0]$ATAC,
  ) +
  geom_text_repel(
    data = dt_lab[ATAC < 0],
    aes(label = label),
    color = "black",
    size = 3,
    segment.color = "grey50",
    min.segment.length = 0,
    nudge_y = -0.5 - dt_lab[ATAC < 0]$ATAC,
  ) +
  geom_point(data = dt_comp[correlation != "no"], alpha = 0.8) +
  scale_color_manual(values = c(
    "positive" = "green",
    "no" = "grey"
  )) +
  scale_y_continuous(expand = c(0.1, 0.1)) +
  facet_grid(vs ~ reporterline) +
  theme(legend.position = "bottom")
gp_comp
## `geom_smooth()` using formula = 'y ~ x'
## Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 18 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 32 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 18 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 32 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Gene networks

Load baseline networks from BINDetect bound targets that are also expressed in given sample, and subset for those TFs for which binding and expression are correlated.

# binding targets
mo_dt <- fread(
  file.path(atac_dir, "motifs_genes_targets_long.tsv.gz")
)
mo_dt <- mo_dt[expressed == TRUE & target_expressed == TRUE]
mo_dt <- mo_dt[grep("pos", sample)]
mo_dt <- mo_dt[expression_lfc > 0]
mo_dt[, reporterline := str_remove(sample, "_pos")]

# binding correaltions with expression
dt_comp <- fread(
  file.path(res_dir, "correlation_expression_binding.tsv")
)
dt_comp_cor <- unique(
  dt_comp[correlation != "no"][, .(motif, gene, reporterline)]
)

# select correlated motifs
mo_dt_cor <- merge.data.table(
  dt_comp_cor, mo_dt, by = c("motif", "gene", "reporterline"), sort = FALSE
)

# add TF-target gene expression correlation
g_cors_dt <- unique(mo_dt_cor[, .(gene, target_gene)])
g_cors_dt <- g_cors_dt[gene %in% rownames(tpms_mt)]
g_cors_dt <- g_cors_dt[target_gene %in% rownames(tpms_mt)]
g_cors_dt[, expression_correlation := cor(
  unlist(tpms_mt[gene, ]),
  unlist(tpms_mt[target_gene, ])
), by = 1:nrow(g_cors_dt)]
mo_dt_cor <- merge.data.table(
  mo_dt_cor, g_cors_dt, by = c("gene", "target_gene"),
  all.x = TRUE, sort = FALSE
)
mo_dt_cor <- mo_dt_cor[!is.na(expression_correlation)]
mo_dt_cor[, c("expressed", "target_expressed") := NULL]

# save
fwrite(
  mo_dt_cor,
  file.path(res_dir, "correlation_network.tsv.gz"),
  sep = "\t"
)

Filtering by expression

mo_dt_cor <- fread(file.path(res_dir, "correlation_network.tsv.gz"))
mo_dt_cor[, reporterline := NULL]
mo_dt_cor[, target_TF := ifelse(target_gene %in% .SD$gene, TRUE, FALSE), sample]
mo_dt_cor[, name := substr(name, 1, 30)]
mo_dt_cor[, pfam := substr(pfam, 1, 30)]
mo_dt_cor[, target_name := substr(target_name, 1, 30)]
mo_dt_cor[, target_pfam := substr(target_pfam, 1, 30)]
mo_dt_cor <- mo_dt_cor[expression_lfc > 1.5][target_expression_lfc > 1]
setcolorder(mo_dt_cor, c(
  "sample", "gene", "name", "pfam",
  "expression_tpm", "expression_lfc",
  "motif", "motif_name",
  "target_gene", "target_name", "target_pfam",
  "target_expression_tpm", "target_expression_lfc", 
  "expression_correlation", "n_samples"
))

How many target genes for each TF?

hits_dt <- mo_dt_cor[, .N, .(gene, motif, sample)]
hits_gp <- ggplot(hits_dt, aes(sample, N, fill = sample)) +
  geom_boxplot(outlier.color = NA) +
  ggbeeswarm::geom_beeswarm(shape = 21) +
  scale_y_continuous(trans = "log10") +
  scale_fill_manual(values = line_cond_cols) +
  labs(y = "target genes per TF") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none",
    panel.grid.major.y = element_line(linewidth = 0.5),
    panel.grid.minor.y = element_line(linewidth = 0.2)
  )
hits_gp

Save network files

require(openxlsx)
wb <- createWorkbook()
for (smp in c("Elav", "Ncol", "Fox")) {
  smp_fn <- file.path(res_dir, sprintf("correlation_network_%s.tsv.gz", smp))
  smp_dt <- mo_dt_cor[sample == paste0(smp, "_pos")]
  message("Saving: ", smp_fn)
  fwrite(smp_dt, smp_fn, sep = "\t")
  # all genes
  addWorksheet(wb, sheetName = smp)
  writeData(wb, sheet = smp, smp_dt)
  # TF-TF
  tfs_dt <- smp_dt[target_TF == TRUE]
  tfs_orph <- smp_dt[!gene %in% tfs_dt$gene, .SD[1], gene]
  tfs_orph[, target_gene := "none"]
  tfs_orph[, c("target_name", "target_pfam") := ""]
  tfs_orph[, c("target_expression_tpm", "target_expression_lfc", "n_samples", "expression_correlation") := 0]
  tfs_dt <- rbindlist(list(tfs_dt, tfs_orph), use.names = TRUE)
  addWorksheet(wb, sheetName = paste0(smp, "_TF"))
  writeData(wb, sheet = paste0(smp, "_TF"), tfs_dt)
}
## Saving: results/correlation_network_Elav.tsv.gz
## Saving: results/correlation_network_Ncol.tsv.gz
## Saving: results/correlation_network_Fox.tsv.gz
saveWorkbook(
    wb,
    file.path(res_dir, "correlation_network.xlsx"),
    overwrite = TRUE
)

Cnidocytes network

Comparison with Ozment et al. 2021 eLife paper.

# load data from paper and translate gene IDs to DToL IDs
pou_act <- readWorkbook(file.path(pub_dir, "elife-74336-supp6-v3.xlsx"), startRow = 2, colNames = TRUE)
pou_rep <- readWorkbook(file.path(pub_dir, "elife-74336-supp7-v3.xlsx"), startRow = 2, colNames = TRUE)
setDT(pou_act)
setDT(pou_rep)
pou_pub <- rbindlist(list(activated = pou_act, repressed = pou_rep), idcol = "relation")
setnames(pou_pub, "gene", "ID_Vienna")
dict <- fread(file.path("annotation", "DICTIONARY_Nvec_vienna_vs_Nvec_old_vs_Nvec_DTOL_v2.txt"))[, .(ID_Vienna, ID_DToL)]
setnames(dict, "ID_DToL", "gene")
pou_pub <- merge.data.table(pou_pub, dict, all.x = TRUE, sort = FALSE)
pou_pub <- pou_pub[, .(gene, relation, fold_change, mean_counts, pvalue)]
setnames(pou_pub, "gene", "target_gene")

# load our Ncol network
pou4 <- "Nvec_vc1.1_XM_032363992.2"
ncol_dt <- fread(file.path(res_dir, "correlation_network_Ncol.tsv.gz"))
pou_net <- ncol_dt[gene == pou4]

# overlap of Pou4 targets
pou_pub_act <- pou_pub[relation == "activated", .(target_gene, fold_change)]
setnames(pou_pub_act, "fold_change", "activated")
pou_pub_rep <- pou_pub[relation == "repressed", .(target_gene, fold_change)]
setnames(pou_pub_rep, "fold_change", "repressed")
pou_net_gen <- pou_net[, .(target_gene, target_expression_lfc)]
setnames(pou_net_gen, "target_expression_lfc", "network")
pou_dt <- merge.data.table(merge.data.table(
  pou_pub_act, pou_pub_rep,
  by = "target_gene", all = TRUE
), pou_net_gen, by = "target_gene", all = TRUE)
pou_mt <- as.matrix(pou_dt[, -1])
pou_mt[!is.na(pou_mt)] <- 1
pou_mt[is.na(pou_mt)] <- 0

require(eulerr)
fit <- euler(pou_mt)
p1 <- plot(
  fit,
  quantities = TRUE
)
print(p1)